#!/usr/bin/env env_indeed2
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 14 14:21:17 2022

@author: galg
"""


import pandas as pd
import numpy as np
import math 
import seaborn as sns
import os,sys
import os.path
from datetime import datetime

# CHANGE
## PATH
data_files = '' # raw data
path_data = '' # analysis data


## HISTORICAL FILES

history1 = pd.read_csv(data_files+'CA_job_level_000000000001_20201009.csv')
history2 = pd.read_csv(data_files+'CA_job_level_000000000000_20201009.csv')

print('Min history1 = ' + history1['date_first_visible'].min())
print('Max history1 = ' + history1['date_first_visible'].max())

print('Min history2 = ' + history2['date_first_visible'].min())
print('Max history2 = ' + history2['date_first_visible'].max())
      
max_historical = pd.read_csv(data_files+'past28_CA_job_level_000000000000_20201014.csv').date_first_visible.min()
print('Min file series = ' + max_historical)
history1 = history1[history1['date_first_visible']<max_historical]
history2 = history2[history2['date_first_visible']<max_historical]

year = 2020
files_list = [file for file in os.listdir(data_files)
              if file.startswith(f'past28_CA_job_level_000000000000_{year}')]
year = 2021
files_list = files_list + [file for file in os.listdir(data_files)
              if file.startswith(f'past28_CA_job_level_000000000000_{year}')]
files_list = files_list + [file for file in os.listdir(data_files)
              if file.startswith(f'CA_job_level_{year}')]
files_list = files_list + [file for file in os.listdir(data_files)
              if file.startswith(f'CA_job_postings_{year}')]
files_list = files_list + [file for file in os.listdir(data_files)
              if file.startswith(f'CA_Job_Postings_{year}')]
files_list = files_list + [file for file in os.listdir(data_files)
              if file.startswith(f'CA_Job_postings_{year}')]

year = 2022
files_list = files_list + [file for file in os.listdir(data_files)
              if file.startswith(f'CA_job_postings_{year}')]

year = 2023
files_list = files_list + [file for file in os.listdir(data_files)
              if file.startswith(f'CA_job_postings_{year}')]

year = 2024
files_list = files_list + [file for file in os.listdir(data_files)
              if file.startswith(f'CA_job_postings_{year}')]

files_list.sort(key=lambda x: os.path.getmtime(data_files + x), reverse=True)
files_list

dfs = []
first_date = f'{year+1}-01-01'
for file in files_list:
    #df = pd.read_csv(data_files + file)
    #print(df.date_first_visible.min())
    #print(df['jladmin1code'].value_counts())
    try:
        df = pd.read_csv(data_files + file, encoding = 'utf8')
        try:
            df.drop(columns='Unnamed: 0', inplace=True)
        except:
            pass
    except:
        print(file)
        df = pd.read_csv(data_files + file, encoding ='latin1')  
        try:
            df.drop(columns='Unnamed: 0', inplace=True) 
        except:
            pass
    dfs.append(df[df.date_first_visible < first_date])
    first_date = df.date_first_visible.min()

df = pd.concat(dfs, axis=0)
df = pd.concat([history1, history2, df], axis=0)
df
print(df['jladmin1code'].value_counts())

#############################################
# keep only the ones with provinces  
provinces = ['ab', 'bc', 'mb', 'nb', 'nl', 'ns', 'on', 'pe', 'sk', 'nt', 'nu', 'yt','qc']

df = df[df['jladmin1code'].isin(provinces)]

# clean data
#df.job_tilte = str(df.job_title)
#df.company_name = str(df.company_name)   
df = df.dropna(subset=['job_title']) # if empty job title, it's useless for us bc can't do classification
df = df.drop(columns=['jladmin2code', 'jladmin3code'])
df.dtypes

# clean data since 01/01/2023 as I will have the classified data
print('Max date = ' + df['date_first_visible'].max())
df=df[df['date_first_visible']>'2023-12-21']  ## HERE IS THE DATE TO CHANGE EACH TIME !!! Max last day =  2024-11-08
################################
# ISUE WITH THE COMPANY NAME DATA
# open duration data
duration = pd.read_parquet(path_data+'dfdur.parquet') # change this file !!!

duration.first_date = pd.to_datetime(duration.first_date)
duration.last_date = pd.to_datetime(duration.last_date)

# attach the company name data, where missing, from the duration data
print(df.shape)
print(df.columns)
df = pd.merge(df, duration[['job_key','company_name']], on='job_key', how='left', suffixes=('', '_dur'))
print(df.shape)
print(df.columns)
df['company_name'] = df['company_name'].fillna(df['company_name_dur'])

# drop using columns
df.drop(['company_name_dur'], axis=1, inplace=True)

# check if null company name:
df['no_company'] = df['company_name'].isna()
print(df['no_company'].value_counts(normalize=True))

###############################
# make non-quebec csv 
non_quebec = df[df['jladmin1code']!='qc']
non_quebec = non_quebec[~non_quebec['job_title'].str.contains('à|è|ì|ò|ù|ç|â|ê|î|ô|û|ë|ï|ü|é|É|À|Â|Ä|È|Ê|Ë|Î|Ì|Ï|Ò|Ö|Ô|Ù|Û|Ü|Ÿ|Ç')]

# make quebec csv
quebec = df[(df['jladmin1code']=='qc')|(df['job_title'].str.contains('à|è|ì|ò|ù|ç|â|ê|î|ô|û|ë|ï|ü|é|É|À|Â|Ä|È|Ê|Ë|Î|Ì|Ï|Ò|Ö|Ô|Ù|Û|Ü|Ÿ|Ç'))]

## clean french csv

select_punct = set('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~0123456789') #only removed "'"
def replace_punctuation(s):
    """
    Takes string as input.
    Removes punctuation from a string if the character is in select_punct.
    Returns a string.

   >>> replace_punctuation('sales executives/ - london')
   'sales executives   london'
    """
    for i in set(select_punct):
        if i in s:
            s = s.replace(i, ' ')

    return s

select_char = set('éàèìòùçâêîôûëïü')
def special_char(s):
    """
    Takes string as input.
    Replaces special characters for the same letter without accent.
    Returns a string.

   >>> replace_punctuation('mémbres')
   'membres'
    """
    for i in set(select_char):
        if i in s:
            #s = s.replace("é", "e")
            s = s.replace("é", "e")
            s = s.replace("è", "e")
            s = s.replace("ê", "e")
            s = s.replace("ë", "e")
            s = s.replace("à", "a")
            s = s.replace("â", "a")
            s = s.replace("ï", "i")
            s = s.replace("ì", "i")
            s = s.replace("î", "i")
            s = s.replace("ò", "o")
            s = s.replace("ô", "o")
            s = s.replace("ù", "u")
            s = s.replace("ü", "u")
            s = s.replace("û", "u")
            s = s.replace("ç", "c")

    return s


quebec['job_title'] = quebec['job_title'].str.lower()
quebec['job_title'] = quebec['job_title'].apply(lambda x: replace_punctuation(x))
quebec['job_title'] = quebec['job_title'].apply(lambda x: special_char(x))

## theres missing values in quebec so needs a lil extra in the function
quebec['company_name'] = quebec['company_name'].str.lower()
quebec['company_name'] = quebec['company_name'].apply(lambda x: special_char(x) if(np.all(pd.notnull(x))) else x)
quebec['company_name'] = quebec['company_name'].apply(lambda x: replace_punctuation(x) if(np.all(pd.notnull(x))) else x)
## save files 
#quebec.to_csv(path_data+"quebec.csv")
#non_quebec.to_csv(path_data+"nonquebec.csv")

## verifying that we have the right dates
print("Min date Non-QC = " + non_quebec['date_first_visible'].min())
print("Max date Non-QC = " + non_quebec['date_first_visible'].max())
print("Min date QC = " + quebec['date_first_visible'].min())
print("Max date QC = " + quebec['date_first_visible'].max())

## save the data
non_quebec.to_csv(path_data + 'non_quebec.csv')
quebec.to_csv(path_data + 'quebec.csv')


